library("parathyroidSE")
library("EnsDb.Hsapiens.v86")
library("dplyr")
library("ggplot2")
data("parathyroidGenesSE", package = "parathyroidSE")
genes <- read.csv(textConnection(
"name, group
ESR1, estrogen
ESR2, estrogen
CASR, parathyroid
VDR, parathyroid
JUN, parathyroid
CALR, parathyroid
ORAI2, parathyroid"),
stringsAsFactors = FALSE, strip.white = TRUE)
ens <- ensembldb::select(EnsDb.Hsapiens.v86,
keys = list(GenenameFilter(genes$name),
TxBiotypeFilter("protein_coding")),
columns = c("GENEID", "GENENAME"))
ens <-
dplyr::filter(ens, GENEID %in% rownames(parathyroidGenesSE)) %>%
mutate(group = genes$group[match(GENENAME, genes$name)])
countData <- assay( parathyroidGenesSE )
gene.counts <- t(countData[ens$GENEID, ])
colnames(gene.counts) <- ens$GENENAME
dat <- cbind(data.frame(colData( parathyroidGenesSE)), data.frame(gene.counts))
head(dat)
## run experiment patient treatment time submission study sample
## 1 SRR479052 SRX140503 1 Control 24h SRA051611 SRP012167 SRS308865
## 2 SRR479053 SRX140504 1 Control 48h SRA051611 SRP012167 SRS308866
## 3 SRR479054 SRX140505 1 DPN 24h SRA051611 SRP012167 SRS308867
## 4 SRR479055 SRX140506 1 DPN 48h SRA051611 SRP012167 SRS308868
## 5 SRR479056 SRX140507 1 OHT 24h SRA051611 SRP012167 SRS308869
## 6 SRR479057 SRX140508 1 OHT 48h SRA051611 SRP012167 SRS308870
## CALR CASR ESR1 ESR2 JUN ORAI2 VDR
## 1 5055 14525 2 1 1799 632 1167
## 2 6161 16870 3 3 1153 1424 1653
## 3 3333 7798 1 1 1086 309 688
## 4 6407 14299 2 1 1227 974 1407
## 5 3810 9235 4 1 1258 326 702
## 6 4885 12994 2 3 906 814 1235
Plot one of the estrogen related gene's counts (ESR1) with plot aesthetics and faceting to separate patients, treatments and times.
ggplot(dat, aes(col = patient, x = treatment, y = ESR1)) +
geom_point(size = 3) +
facet_grid( . ~ time)
From the plot of the parathyroid data, answer the following.
Quiz question 1 : How many patients are there?
Answer: There are four patients.
Quiz question 2 : How many time points are there?
Answer: There are two timepoints (24h, 48h).
Quiz question 3 : There were 3 treatments: "Control", "DPN", and "OHT". How many measurements were taken from patient 2 under the DPN treatment?
Answer: There were three measurements taken for patient 2 (green) under DPN treament.
Question: Make your own plot of VDR versus CASR. (That is CASR, not CALR).
Answer:
ggplot(dat, aes( x = VDR, y = CASR)) +
geom_point(aes(fill = patient, shape = treatment, alpha=time), size = 3) +
scale_alpha_manual(values = c(1.0, 0.7)) +
scale_shape_manual(values = c(21, 22, 24)) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
facet_grid( . ~ time)
Quiz question 4 : Which patient has the highest recorded level of CASR?
Answer: Patient 2.
Quiz question 5 : Which of the following pairs of patients seem to be well separated in this plot (i.e., for which two patients you can draw a line on the plot that perfectly separates them)?
Answer: Patient 1 and patient 2, also Patient 1 and patient 3.
Quiz question 6 : You need to know which pairs of patients are well separated with respect to the genes VDR and CASR (i.e., you can draw a line on the plot that perfectly separates the patients). Which of the following methods can help you visualize this?
Answer: Plot VDR versus CASR, and change the shape of the point according to the patient. Plot VDR versus CASR, and color the points according to the patient.
Quiz question 7 : Which patient looks different from the other three when you plot VDR versus ORAI2?
Answer: Patient 2.
ggplot(dat, aes( x = VDR, y = ORAI2)) +
geom_point(aes(fill = patient, shape = treatment, alpha=time), size = 3) +
scale_alpha_manual(values = c(1.0, 0.7)) +
scale_shape_manual(values = c(21, 22, 24)) +
guides(fill = guide_legend(override.aes=list(shape=21))) +
facet_grid( . ~ time)
Quiz question 8 : Plot ORAI2 versus JUN. Which can you separate best?
Answer: 24 hours vs. 48 hours.
ggplot(dat, aes( x = JUN, y = ORAI2)) +
geom_point(aes(fill = time, shape = patient, alpha=treatment), size = 3) +
scale_alpha_manual(values = c(1.0, 0.7, 0.5)) +
scale_shape_manual(values = c(21, 22, 23,24)) +
guides(fill = guide_legend(override.aes=list(shape=21)))
Quiz question 9 : Plot CASR versus (ESR1 + ESR2). Fit a separate linear model for each treatment (Control, DPN, OHT). Which linear models are increasing?
Answer: DPN and OHT.
ggplot(dat, aes( y = CASR, x = ESR1 + ESR2)) +
geom_point(aes(fill = treatment, color=treatment, shape = patient, alpha=time), size = 3) +
geom_smooth(aes(color = treatment), method = "lm", se=FALSE)+
scale_alpha_manual(values = c(1.0, 0.6)) +
scale_shape_manual(values = c(21, 22, 23,24)) +
guides(fill = guide_legend(override.aes=list(shape=21)))
## `geom_smooth()` using formula 'y ~ x'
Quiz question 10 : What is the maximum number of shapes that you are allowed to use in ggplot2 by default?
Answer: 6.
df_shape <- data.frame(x=runif(10),y=runif(10), shapes=as.factor(1:10))
ggplot(df_shape, aes(x=x,y=y, shape=shapes)) + geom_point()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 10. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 4 rows containing missing values (geom_point).
Quiz question 11 : Write the name of the function that you could use to make more than the maximum number of default shapes allowed. Hint: this function has "values" as one of the arguments ____(..., values = (...)).
Answer: scale_shape_manual.
Quiz question 12 : What do Themes do in ggplot2?
Answer: They control non-data components of the plot.
You will try to recreate a plot from an Economist article showing the relationship between well-being and financial inclusion.
You can find the accompanying article at this link
The data for the exercises EconomistData.csv can be downloaded from the class github repository.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.3 ✓ purrr 0.3.4
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks ensembldb::filter(), stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x dplyr::select() masks ensembldb::select(), AnnotationDbi::select()
## x purrr::simplify() masks DelayedArray::simplify()
## x dplyr::slice() masks IRanges::slice()
url <- paste0("https://raw.githubusercontent.com/cme195/cme195.github.io/",
"master/assets/data/EconomistData.csv")
dat <- read_csv(url)
## Parsed with column specification:
## cols(
## Country = col_character(),
## SEDA.Current.level = col_double(),
## SEDA.Recent.progress = col_double(),
## Wealth.to.well.being.coefficient = col_double(),
## Growth.to.well.being.coefficient = col_double(),
## Percent.of.15plus.with.bank.account = col_double(),
## EPI_regions = col_character(),
## Region = col_character()
## )
head(dat)
## # A tibble: 6 x 8
## Country SEDA.Current.le… SEDA.Recent.pro… Wealth.to.well.… Growth.to.well.…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Albania 50 63.3 1.27 1.31
## 2 Algeria 40.6 46.5 0.87 1.03
## 3 Angola 17.8 76.2 0.54 1.21
## 4 Argent… 54.1 49.1 0.91 0.89
## 5 Armenia 43.8 46 1.25 1.11
## 6 Austra… 87.9 40.9 1.07 0.92
## # … with 3 more variables: Percent.of.15plus.with.bank.account <dbl>,
## # EPI_regions <chr>, Region <chr>
Percent.of.15plus.with.bank.account column) and the y axis corresponds to the current SEDA score SEDA.Current.level.Region variable.geom_smooth to a low value and see what happens.geom_smooth with an appropriate method argument.Region.#1. Create a scatter plot with percent of people over the age of 15 with a bank
p <- ggplot(
dat, aes(x = Percent.of.15plus.with.bank.account, y = SEDA.Current.level))
p + geom_point()
#2. Color the points in the previous plot blue.
p + geom_point(color = "blue")
#3. Color the points in the previous plot according to the `Region`.
(p3 <- p + geom_point(aes(color = Region)))
# 4. Overlay a smoothing line on top of the scatter plot using the default method.
p3 + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#4. Changing the span parameter
p3 + geom_smooth(span = 0.2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#5. Overlay a smoothing line on top of the scatter plot using the lm method
(p5 <- p3 + geom_smooth(method = "lm"))
## `geom_smooth()` using formula 'y ~ x'
# 6. Facetting plots
p5 + facet_wrap(~ Region)
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning -
## Inf
Region.ggplot(dat, aes(x = Region)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
dat <- dat %>%
mutate(reg = reorder(Region, Region, function(x) -length(x)))
barplot <- ggplot(dat, aes(x = reg)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
barplot
barplot + coord_flip()
SEDA.Current.level separately for each Region.plt <- ggplot(dat, aes(x = Region, y = SEDA.Current.level)) +
theme(axis.text.x = element_text(angle = 15, hjust = 1))
plt + geom_boxplot()
plt + geom_boxplot() + geom_point()
plt + geom_boxplot() + geom_jitter(width = 0.1)
plt + geom_violin() + geom_jitter(width = 0.1)
Below, I will show you how to obtain an 'Economist-look' for your scatter plot in few lines of code. To generate a replicate plot we need to:
Region column to a factor.colors <- c("#28AADC","#F2583F", "#76C0C1","#24576D", "#248E84","#DCC3AA", "#96503F")ggthemes package has a convenient functions for generating "Economist" style plots, e.g. theme_economist_white().First, change order of and lables for Regions
regions <- c("Europe", "Asia", "Oceania", "North America",
"Latin America & the Caribbean", "Middle East & North Africa",
"Sub-Saharan Africa")
# Here we are just modifying labels so that some names are on two lines
region_labels <- c("Europe", "Asia", "Oceania", "North America",
"Latin America & \n the Caribbean",
"Middle East & \n North Africa", "Sub-Saharan \n Africa")
dat <- dat %>%
mutate(
Region = as.character(Region),
Region = factor(Region, levels = regions, labels = region_labels)
)
custom_colors <- c("#28AADC","#F2583F", "#76C0C1","#24576D", "#248E84",
"#DCC3AA","#96503F")
p <- ggplot(
dat, aes(Percent.of.15plus.with.bank.account, SEDA.Current.level)) +
geom_point(aes(fill = Region), color = "white", size = 4, pch = 21) +
geom_smooth(method = "lm", se = FALSE, col = "black", size = 0.5) +
scale_fill_manual(name = "", values = custom_colors) +
coord_fixed(ratio = 0.4) +
scale_x_continuous(name = "% of people aged 15+ with bank account, 2014",
limits = c(0, 100),
breaks = seq(0, 100, by = 20)) +
scale_y_continuous(name = "SEDA Score, 100=maximum",
limits = c(0, 100),
breaks = seq(0, 100, by = 20)) +
labs(title="Laughing all the way to the bank",
subtitle="Well-being and financial inclusion* \n 2014-15")
p
## `geom_smooth()` using formula 'y ~ x'
To change the background and theme to match the 'Economist style', you can install the ggthemes package that implements the themes from:
#install.packages("ggthemes")
library(ggthemes)
(p <- p + theme_economist_white(gray_bg = FALSE))
## `geom_smooth()` using formula 'y ~ x'
Format the legend
p + theme(
text = element_text(color = "grey35", size = 11),
legend.text = element_text(size = 10),
legend.position = c(0.72, 1.12),
legend.direction = "horizontal") +
guides(fill = guide_legend(ncol = 4, byrow = FALSE))
## `geom_smooth()` using formula 'y ~ x'
# Choose a subset of countries
pointsToLabel <- c(
"Yemen", "Iraq", "Egypt", "Jordan", "Chad", "Congo", "Angola", "Albania",
"Zimbabwe", "Uganda", "Nigeria", "Uruguay", "Kazakhstan", "India", "Turkey",
"South Africa", "Kenya", "Russia", "Brazil", "Chile", "Saudi Arabia",
"Poland", "China", "Serbia", "United States", "United Kingdom")
# install.packages("ggrepel")
library(ggrepel)
(p <- p +
geom_text_repel(
aes(label = Country), color = "grey20",
data = dat %>% filter(Country %in% pointsToLabel),
force = 15))
## `geom_smooth()` using formula 'y ~ x'
library(readr)
url <- "http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv"
emissions <- read_csv(url)
## Parsed with column specification:
## cols(
## Nation = col_character(),
## Year = col_double(),
## `Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)` = col_double(),
## `Emissions from solid fuel consumption` = col_double(),
## `Emissions from liquid fuel consumption` = col_double(),
## `Emissions from gas fuel consumption` = col_double(),
## `Emissions from cement production` = col_double(),
## `Emissions from gas flaring` = col_character(),
## `Per capita CO2 emissions (metric tons of carbon)` = col_character(),
## `Emissions from bunker fuels (not included in the totals)` = col_double()
## )
## Warning: 615 parsing failures.
## row col expected actual file
## 1 NA 10 columns 1 columns 'http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv'
## 2 NA 10 columns 1 columns 'http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv'
## 3 NA 10 columns 1 columns 'http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv'
## 3471 Emissions from solid fuel consumption a double . 'http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv'
## 3471 Emissions from liquid fuel consumption a double . 'http://cdiac.ess-dive.lbl.gov/ftp/ndp030/CSV-FILES/nation.1751_2014.csv'
## .... ...................................... .......... ......... .........................................................................
## See problems(...) for more details.
emissions <- emissions[-(1:3), ]
emissions[emissions == "."] <- NA
emissions <- type_convert(emissions)
## Parsed with column specification:
## cols(
## Nation = col_character(),
## `Emissions from gas flaring` = col_double(),
## `Per capita CO2 emissions (metric tons of carbon)` = col_double()
## )
emissions
## # A tibble: 17,232 x 10
## Nation Year `Total CO2 emis… `Emissions from… `Emissions from…
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 AFGHA… 1949 4 4 0
## 2 AFGHA… 1950 23 6 18
## 3 AFGHA… 1951 25 7 18
## 4 AFGHA… 1952 25 9 17
## 5 AFGHA… 1953 29 10 18
## 6 AFGHA… 1954 29 12 18
## 7 AFGHA… 1955 42 17 25
## 8 AFGHA… 1956 50 17 33
## 9 AFGHA… 1957 80 21 59
## 10 AFGHA… 1958 90 25 65
## # … with 17,222 more rows, and 5 more variables: `Emissions from gas fuel
## # consumption` <dbl>, `Emissions from cement production` <dbl>, `Emissions
## # from gas flaring` <dbl>, `Per capita CO2 emissions (metric tons of
## # carbon)` <dbl>, `Emissions from bunker fuels (not included in the
## # totals)` <dbl>
dplyr functions to compute the total yearly \(CO_2\) emissions (column Total.CO2.emissions.from.fossil.fuels.and.cement.production..thousand.metric. tons.of.C.) summed over all countries (the world total \(CO_2\) emission). Use the dataset to plot the World's yearly \(CO_2\) emission in Gt.yearlyCO2 <- emissions %>%
group_by(Year) %>%
summarise(total_CO2 = sum(`Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)`))
## `summarise()` ungrouping output (override with `.groups` argument)
yearlyCO2
## # A tibble: 264 x 2
## Year total_CO2
## <dbl> <dbl>
## 1 1751 2552
## 2 1752 2553
## 3 1753 2553
## 4 1754 2554
## 5 1755 2555
## 6 1756 2731
## 7 1757 2732
## 8 1758 2733
## 9 1759 2734
## 10 1760 2734
## # … with 254 more rows
top10 <- emissions %>%
filter(Year >= 2000) %>%
group_by(Nation) %>%
summarise(CountryCO2Post200 = sum(`Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)`)) %>%
top_n(10) %>%
.[["Nation"]]
## `summarise()` ungrouping output (override with `.groups` argument)
## Selecting by CountryCO2Post200
Plot the yearly total CO2 emissions of these top 10 countries with a different color for each country. Use billion tonnes (Gt) units, i.e. divide the total emissions by 10^6.
ggplot(
emissions %>%
filter(Nation %in% top10),
aes(x = Year, y = `Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)`/1e6)) +
geom_line(aes(color = Nation)) +
ylab("Total CO2 emission in Gt") +
theme_bw()
geom_area() to generate a plot similar to the one you generated above but, with emission levels stacked on top of each other (summing to the total for the ten countries) with areas colored by countries.df <- emissions %>% filter(Nation %in% top10)
ordered_countries <- df %>%
filter(Year == max(Year)) %>%
arrange(`Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)`) %>% .[["Nation"]]
df <- df %>%
mutate(Nation = factor(Nation, levels = ordered_countries))
library(RColorBrewer)
colors10 <- colorRampPalette(brewer.pal(9, "Set2"))(10)
## Warning in brewer.pal(9, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
ggplot(
df,
aes(x = Year, y = `Total CO2 emissions from fossil-fuels and cement production (thousand metric tons of C)`/1e6)) +
geom_area(aes(fill = Nation), position = "stack") +
ylab("Total CO2 emission in Gt") +
scale_fill_manual(values = colors10) +
theme_bw()